{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 打开页面"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "\n",
    "\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')  # works around the DevToolsActivePort-file-does-not-exist startup error\n",
    "opts.add_argument('--window-size=1920,3000')  # browser window size; Chrome expects WIDTH,HEIGHT\n",
    "opts.add_argument('--disable-gpu')  # Google's docs mention this flag as a bug workaround\n",
    "opts.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages\n",
    "\n",
    "# driver = webdriver.Firefox()\n",
    "# `options=` is the supported keyword; `chrome_options=` is deprecated\n",
    "driver = webdriver.Chrome(options=opts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "from lxml.html import fromstring\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import time\n",
    "from random import random\n",
    "import requests_html\n",
    "from requests_html import HTMLSession"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.get(\"https://mp.weixin.qq.com\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//a[@class=\"login__type__container__select-type\"]')\n",
    "element.get_attribute('innerHTML')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Login credentials are read from the environment, falling back to an interactive\n",
    "# prompt, so that no account name or password is stored in the notebook itself.\n",
    "import os\n",
    "from getpass import getpass\n",
    "\n",
    "payload = {\n",
    "    \"account\": os.environ.get(\"WECHAT_MP_ACCOUNT\") or input(\"account: \"),\n",
    "    \"password\": os.environ.get(\"WECHAT_MP_PASSWORD\") or getpass(\"password: \"),\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 输入账户密码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//input[@name=\"account\"]')\n",
    "element.get_attribute('innerHTML')\n",
    "element.clear()\n",
    "element.send_keys(payload['account'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//input[@name=\"password\"]')\n",
    "element.get_attribute('innerHTML')\n",
    "element.clear()\n",
    "element.send_keys(payload['password'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//a[@class=\"btn_login\"]')\n",
    "element.get_attribute('innerHTML')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 展开"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "#展开\n",
    "element = driver.find_element_by_xpath('//*[@id=\"mp_header\"]/div/div/a')\n",
    "element.get_attribute('innerHTML')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "#点击图文素材\n",
    "element = driver.find_element_by_xpath('//*[@id=\"menuBar\"]/li[2]/ul/li[1]/a')\n",
    "element.get_attribute('innerHTML')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击+\n",
    "element = driver.find_element_by_xpath('//*[@id=\"js_main\"]/div[3]/div[2]/div/div/div/div[1]/div/div/div[1]')\n",
    "element.click()\n",
    "# 如果失败可使用浏览器悬停功能"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "#新的图文\n",
    "element = driver.find_element_by_xpath('//*[@id=\"js_main\"]/div[3]/div[2]/div/div/div/div[1]/div/div/div[2]/ul/li[1]/a')\n",
    "\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 切换页面"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-ADCEF8551A97EDA2C511DE97FBC07332',\n",
       " 'CDwindow-6979533B540E1B986BCE4C5CD85AF741']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Switch to the second window handle (presumably the editor tab that was just opened)\n",
    "# `switch_to.window` replaces the deprecated `switch_to_window`\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 超链接\n",
    "element = driver.find_element_by_xpath('//*[@id=\"js_editor_insertlink\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 搜索公众号 爬取信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "#选择其他公众号\n",
    "element = driver.find_element_by_xpath('/html/body/div[2]/div/div/div/div/div[6]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[3]/div/div/p/div/button')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Type the account name 南方都市报 into the account search box\n",
    "SEARCH_INPUT_XPATH = '/html/body/div[2]/div/div/div/div/div[6]/div[2]/div[1]/div/div[2]/div[2]/form[1]/div[3]/div/div/div/div/span/input'\n",
    "search_input = driver.find_element_by_xpath(SEARCH_INPUT_XPATH)\n",
    "search_input.clear()  # must be called: a bare `.clear` attribute reference does nothing\n",
    "search_input.send_keys(\"南方都市报\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-icon weui-desktop-icon__search weui-desktop-icon__small\" style=\"width: 20px; height: 20px;\"><!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!---->     <svg viewBox=\"0 0 24 24\" version=\"1.1\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><title>MP/Icon/Search</title> <g id=\"MP/Icon/Search\" stroke=\"none\" stroke-width=\"1\" fill=\"none\" fill-rule=\"evenodd\"><path d=\"M5.78025253,5.78248558 C8.51392257,3.04881554 12.9460774,3.04881554 15.6797475,5.78248558 C18.1730922,8.27583028 18.3922898,12.1821488 16.3373403,14.9239313 L20.6294949,19.2175144 L19.2152814,20.631728 L14.922508,16.3389663 C12.180685,18.394566 8.27384272,18.1755707 5.78025253,15.6819805 C3.04658249,12.9483105 3.04658249,8.51615562 5.78025253,5.78248558 Z M6.8409127,6.84314575 C4.6930291,8.99102935 4.6930291,12.4734367 6.8409127,14.6213203 C8.98879631,16.7692039 12.4712037,16.7692039 14.6190873,14.6213203 C16.7669709,12.4734367 16.7669709,8.99102935 14.6190873,6.84314575 C12.4712037,4.69526215 8.98879631,4.69526215 6.8409127,6.84314575 Z\" id=\"形状\"></path></g></svg> <!----> <!----> <!----> <!----> <!----></div>\n"
     ]
    }
   ],
   "source": [
    "# 点放大镜搜\n",
    "element = driver.find_element_by_xpath('//button[@class=\"weui-desktop-icon-btn weui-desktop-search__btn\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/3OEpTPib0kVicrzDWicH7JWJpiacbysXllOEbCSNUJCZxEZEhgP51W1Y8om1ZHyxlfMw7dBSw2IgDWCjshddggeiaFQ/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">南方都市报</strong> <i class=\"inner_link_account_wechat\">微信号：nddaily</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/zzoUJzlKT41tz4jfPPzxI6nWsXQdo458HkQRw6RfAWbBIPVT9NcibYTzYGQQn3l6Fs5dLwenuwBMyS03S28rOqg/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">爱南方</strong> <i class=\"inner_link_account_wechat\">微信号：未设置</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">服务号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/jribTUbtKkD0sjMf7v2lhz06ZhFcdW38XdraMAfNB49dbqgibCzZx3E6xwDxa1WxaKlaHK3n0yWic6P6P3CibYwVwQ/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">都市报童</strong> <i class=\"inner_link_account_wechat\">微信号：dushibaotong</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/zlrHiaWN66fpUy0XLwTteT1ibM39FwW8ylgEFghOuOd4SWwCX3IdDWoVaHaErYbIWU4ic1ibXSI4DQVGqZ9g6pS0Vg/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">正點观影</strong> <i class=\"inner_link_account_wechat\">微信号：nd_ent</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li>\n"
     ]
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//ul[@class=\"inner_link_account_list\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "公众号SERP = main_content\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 解析\n",
    "root = fromstring(公众号SERP) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "主 = root.xpath('//li[@class=\"inner_link_account_item\"]')\n",
    "\n",
    "account_list = []\n",
    "for e in 主:\n",
    "    account_nickname = e.xpath('./div/strong[@class=\"inner_link_account_nickname\"]')[0].text\n",
    "    account_wechat = e.xpath('./div/i[@class=\"inner_link_account_wechat\"]')[0].text\n",
    "    account_img = e.xpath('./div/img/@src')[0]\n",
    "    account = {\"nickname\": account_nickname, \"wechat\": account_wechat, \"img\": account_img,}\n",
    "    account_list.append(account)\n",
    "\n",
    "df_account = pd.DataFrame(account_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nickname</th>\n",
       "      <th>wechat</th>\n",
       "      <th>img</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>南方都市报</td>\n",
       "      <td>微信号：nddaily</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/3OEpTPib0kVicrz...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>爱南方</td>\n",
       "      <td>微信号：未设置</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/zzoUJzlKT41tz4j...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>都市报童</td>\n",
       "      <td>微信号：dushibaotong</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/jribTUbtKkD0sjM...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>正點观影</td>\n",
       "      <td>微信号：nd_ent</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/zlrHiaWN66fpUy0...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  nickname            wechat  \\\n",
       "0    南方都市报       微信号：nddaily   \n",
       "1      爱南方           微信号：未设置   \n",
       "2     都市报童  微信号：dushibaotong   \n",
       "3     正點观影        微信号：nd_ent   \n",
       "\n",
       "                                                 img  \n",
       "0  http://mmbiz.qpic.cn/mmbiz_png/3OEpTPib0kVicrz...  \n",
       "1  http://mmbiz.qpic.cn/mmbiz_png/zzoUJzlKT41tz4j...  \n",
       "2  http://mmbiz.qpic.cn/mmbiz_png/jribTUbtKkD0sjM...  \n",
       "3  http://mmbiz.qpic.cn/mmbiz_png/zlrHiaWN66fpUy0...  "
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_account"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/3OEpTPib0kVicrzDWicH7JWJpiacbysXllOEbCSNUJCZxEZEhgP51W1Y8om1ZHyxlfMw7dBSw2IgDWCjshddggeiaFQ/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">南方都市报</strong> <i class=\"inner_link_account_wechat\">微信号：nddaily</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div>\n"
     ]
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//ul[@class=\"inner_link_account_list\"]/li')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n跳转_input = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/input\\')\\n跳转_a = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/a\\')\\n跳转_input.clear()\\n跳转_input.send_keys(2)\\n跳转_a.click()\\n'"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 跳转testing\n",
    "'''\n",
    "跳转_input = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "跳转_a = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "跳转_input.clear()\n",
    "跳转_input.send_keys(2)\n",
    "跳转_a.click()\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 1336]\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "# 跳转上限\n",
    "l_e = driver.find_elements_by_xpath('//label[@class=\"weui-desktop-pagination__num\"]')\n",
    "l_e_int  = [int(x.text) for x in l_e] \n",
    "print (l_e_int)\n",
    "print (l_e_int[0]==l_e_int[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55]\n"
     ]
    }
   ],
   "source": [
    "# Page numbers to scrape: 1 up to the last page number shown by the pagination\n",
    "# labels, capped at the first MAX_PAGES pages.\n",
    "MAX_PAGES = 55\n",
    "pages = list(range(1, l_e_int[-1] + 1))[:MAX_PAGES]\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 循环/遍历"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# global variables \n",
    "html_raw = dict()\n",
    "main_content =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_pages(pages):\n",
    "    \"\"\"Jump to every page number in `pages` and record that page's article-list HTML.\n",
    "\n",
    "    Each page number is typed into the pagination jump box and submitted. After a\n",
    "    random pause, the innerHTML of the article-list element is stored in the\n",
    "    module-level `html_raw` dict under that page number. Returns None.\n",
    "    \"\"\"\n",
    "    for page_no in pages:\n",
    "        # Progress indicator: tab-separated page numbers on one line\n",
    "        print(page_no, end='\\t')\n",
    "\n",
    "        # Locate both pagination controls before touching either of them\n",
    "        jump_box = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "        jump_link = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "        jump_box.clear()\n",
    "        jump_box.send_keys(page_no)\n",
    "        jump_link.click()\n",
    "\n",
    "        # Pause a random 80-280 seconds before reading the list\n",
    "        # (presumably to let the page load and to throttle requests)\n",
    "        pause_seconds = 80 + 200 * random()\n",
    "        time.sleep(pause_seconds)\n",
    "\n",
    "        article_list = driver.find_element_by_xpath('//div[@class=\"inner_link_article_list\"]')\n",
    "        html_raw[page_no] = article_list.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t21\t22\t23\t24\t25\t26\t27\t28\t29\t30\t31\t32\t33\t34\t35\t36\t37\t38\t39\t40\t41\t42\t43\t44\t45\t46\t47\t48\t49\t50\t51\t52\t53\t54\t55\t"
     ]
    }
   ],
   "source": [
    "process_pages(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>55</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "1   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "2   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "3   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "4   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "5   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "6   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "7   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "8   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "9   <div class=\"weui-desktop-radio-group\"><label c...\n",
       "10  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "11  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "12  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "13  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "14  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "15  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "16  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "17  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "18  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "19  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "20  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "21  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "22  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "23  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "24  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "25  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "26  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "27  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "28  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "29  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "30  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "31  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "32  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "33  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "34  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "35  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "36  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "37  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "38  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "39  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "40  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "41  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "42  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "43  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "44  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "45  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "46  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "47  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "48  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "49  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "50  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "51  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "52  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "53  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "54  <div class=\"weui-desktop-radio-group\"><label c...\n",
       "55  <div class=\"weui-desktop-radio-group\"><label c..."
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Index = keys of html_raw, single column = the raw HTML stored under each key.\n",
    "df = pd.DataFrame([html_raw], index=[\"html_snippets\"]).T\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stored 'html_raw' (dict)\n"
     ]
    }
   ],
   "source": [
    "%store html_raw\n",
    "import pickle\n",
    "\n",
    "# Also persist the scraped snippets to a pickle file. The context manager\n",
    "# closes the handle, so the data is flushed to disk even if dump() raises.\n",
    "with open(\"html_raw\", 'wb') as filehandler:\n",
    "    pickle.dump(html_raw, filehandler)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "55\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [html_snippets]\n",
       "Index: []"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Drop repeated snippets (the first occurrence is kept), report how many\n",
    "# remain, then show the rows that were identified as repeats.\n",
    "df_out = df.drop_duplicates()\n",
    "print(len(df_out))\n",
    "df.loc[df.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pages to fetch again: index labels of duplicated snippets first,\n",
    "# then every page that has no row in df at all.\n",
    "try_again = df.index[df.duplicated()].tolist()\n",
    "print(try_again)\n",
    "try_again += list(set(pages) - set(df.index.values))\n",
    "try_again"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 暂存档"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "fn = { \"output\" : { \"公众号_htm_snippets\": \"公众号_htm_snippets_{公众号}.tsv\",\n",
    "                    \"公众号_df\": \"公众号_df_{公众号}.tsv\",\n",
    "                    \"公众号_xlsx\": \"公众号_url_{公众号}.xlsx\" } \\\n",
    "      }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the de-duplicated snippets as a tab-separated file named after the account.\n",
    "公众号 = \"南方都市报\"\n",
    "filename = fn[\"output\"][\"公众号_htm_snippets\"]\n",
    "df_out.to_csv(\n",
    "    filename.format(公众号=公众号),\n",
    "    sep=\"\\t\",\n",
    "    encoding=\"utf8\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "39,40,40,40,40,40,40,40,40,40,40,40,39,39,40,40,40,40,40,40,40,39,40,40,40,40,40,40,40,40,40,39,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,39,39,40,38,40,"
     ]
    }
   ],
   "source": [
    "def get_content(link):\n",
    "    \"\"\"Fetch one article page and return its body text as a single string.\n",
    "\n",
    "    Args:\n",
    "        link: URL of the article page.\n",
    "\n",
    "    Returns:\n",
    "        Every text node found directly under a ``span`` inside the element\n",
    "        with id ``js_content``, followed by every text node found directly\n",
    "        under a ``p`` there, all joined without a separator.\n",
    "    \"\"\"\n",
    "    content_xpath_1 = '//*[@id=\"js_content\"]//span/text()'\n",
    "    content_xpath_2 = '//*[@id=\"js_content\"]//p/text()'\n",
    "    # One session per call, as before, but the context manager now closes it;\n",
    "    # previously each call left its session (and connection pool) open.\n",
    "    with HTMLSession() as session:\n",
    "        r = session.get(url=link)\n",
    "        content_1 = ''.join(r.html.xpath(content_xpath_1))\n",
    "        content_2 = ''.join(r.html.xpath(content_xpath_2))\n",
    "    # NOTE(review): span text is emitted before p text, so the result does not\n",
    "    # follow the article's reading order - confirm this is intended.\n",
    "    return content_1 + content_2\n",
    "\n",
    "def parse_html_snippets(_snippet_):\n",
    "    \"\"\"Turn one HTML snippet into a DataFrame with one row per article.\n",
    "\n",
    "    Args:\n",
    "        _snippet_: HTML source to parse.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with columns title, create_time, link and content_text,\n",
    "        where content_text is what get_content returns for each link.\n",
    "    \"\"\"\n",
    "    tree = fromstring(_snippet_)\n",
    "    titles = list(tree.xpath('//div[@class=\"inner_link_article_title\"]//span[2]/text()'))\n",
    "    dates = [node.text for node in tree.xpath('//div[@class=\"inner_link_article_date\"]')]\n",
    "    links = list(tree.xpath('//a/@href'))\n",
    "    # One download per link found in the snippet.\n",
    "    bodies = list(map(get_content, links))\n",
    "    return pd.DataFrame(\n",
    "        {\"title\": titles, \"create_time\": dates, \"link\": links, \"content_text\": bodies}\n",
    "    )\n",
    "    \n",
    "# Parse the first 55 pages one at a time, printing how many rows each page yielded.\n",
    "l_df = []\n",
    "for p in pages[:55]:\n",
    "    _df_ = parse_html_snippets(df.loc[p, \"html_snippets\"])\n",
    "    l_df.append(_df_)\n",
    "    print(len(_df_), end=\",\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>content_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>广州最新！7地铁站只出不进！南浦全岛检测！荔湾隔离居民安排来了</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>今早（5月29日），《广州市新型冠状病毒肺炎疫情防控指挥部关于新冠肺炎疫情分级分类防控工作的...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>佛山刚刚通报！1岁女童确诊</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>据广东卫健委，5月28日0-24时，广东全省新增2例本土，广州报告1例，为无症状感染者转确诊...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>中国老太远赴法国，后续令人惊叹……</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>·</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>冯小刚已赔1.68亿</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>本周三（5月26日），因冯小刚对赌失败一事，华谊兄弟再被推上热搜：冯小刚已向华谊支付1.68...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>​冲上热搜！这100份申请书，震撼心灵</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>钱学森、钱三强等人入党申请书长啥样？他们是怎么跟党组织“交心”？为何不少高龄科学家也要入党？...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>受贿150,000,000余元！一“警虎”受审</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>5月27日，63岁的内蒙古“警虎”马明站在山东省淄博市中级人民法院被告席上，他。马明当庭表示...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>4名港澳籍青年，入职广州公务员</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>近日，。据悉，除了在公务员招考时定向录取内地高校毕业的港澳优秀应届生，广州还正在试点在公职机...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>爆款电影《哪吒》被诉抄袭，一审判了</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>还记得“我命由我不由天”的国产动漫电影《哪吒之魔童降世》（下称《哪吒》）吗？因认为《哪吒》在...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>周知！广州多地中小学暂停线下教学！五街道每户每天一人外出、暂停堂食</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>广州荔湾区四街道调整为中风险根据广东省新冠肺炎防控指挥办《关于印发广东省应对新型冠状病毒肺炎...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>深夜，华师辟谣</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>网传华南师范大学2020级某学生有中风险地区旅居史隐瞒不报、出现发热症状并确诊新冠肺炎等情况...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>薇娅道歉</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>昨天（5月28日），薇娅及其团队就此前直播间的Supreme联名小风扇一事发布了致歉声明，“...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                title create_time  \\\n",
       "0     广州最新！7地铁站只出不进！南浦全岛检测！荔湾隔离居民安排来了  2021-05-29   \n",
       "1                       佛山刚刚通报！1岁女童确诊  2021-05-29   \n",
       "2                   中国老太远赴法国，后续令人惊叹……  2021-05-29   \n",
       "3                          冯小刚已赔1.68亿  2021-05-29   \n",
       "4                 ​冲上热搜！这100份申请书，震撼心灵  2021-05-29   \n",
       "5             受贿150,000,000余元！一“警虎”受审  2021-05-29   \n",
       "6                     4名港澳籍青年，入职广州公务员  2021-05-29   \n",
       "7                   爆款电影《哪吒》被诉抄袭，一审判了  2021-05-29   \n",
       "8   周知！广州多地中小学暂停线下教学！五街道每户每天一人外出、暂停堂食  2021-05-29   \n",
       "9                             深夜，华师辟谣  2021-05-29   \n",
       "10                               薇娅道歉  2021-05-29   \n",
       "\n",
       "                                                 link  \\\n",
       "0   http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1   http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2   http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "3   http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "4   http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "5   http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "6   http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "7   http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "8   http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "9   http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "10  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "\n",
       "                                         content_text  \n",
       "0   今早（5月29日），《广州市新型冠状病毒肺炎疫情防控指挥部关于新冠肺炎疫情分级分类防控工作的...  \n",
       "1   据广东卫健委，5月28日0-24时，广东全省新增2例本土，广州报告1例，为无症状感染者转确诊...  \n",
       "2                                                   ·  \n",
       "3   本周三（5月26日），因冯小刚对赌失败一事，华谊兄弟再被推上热搜：冯小刚已向华谊支付1.68...  \n",
       "4   钱学森、钱三强等人入党申请书长啥样？他们是怎么跟党组织“交心”？为何不少高龄科学家也要入党？...  \n",
       "5   5月27日，63岁的内蒙古“警虎”马明站在山东省淄博市中级人民法院被告席上，他。马明当庭表示...  \n",
       "6   近日，。据悉，除了在公务员招考时定向录取内地高校毕业的港澳优秀应届生，广州还正在试点在公职机...  \n",
       "7   还记得“我命由我不由天”的国产动漫电影《哪吒之魔童降世》（下称《哪吒》）吗？因认为《哪吒》在...  \n",
       "8   广州荔湾区四街道调整为中风险根据广东省新冠肺炎防控指挥办《关于印发广东省应对新型冠状病毒肺炎...  \n",
       "9   网传华南师范大学2020级某学生有中风险地区旅居史隐瞒不报、出现发热症状并确诊新冠肺炎等情况...  \n",
       "10  昨天（5月28日），薇娅及其团队就此前直播间的Supreme联名小风扇一事发布了致歉声明，“...  "
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stack the per-page frames into one table with a fresh 0..n-1 index,\n",
    "# then preview rows 0 through 10.\n",
    "df_url_out = pd.concat(l_df, ignore_index=True)\n",
    "df_url_out.loc[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>content_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2186</th>\n",
       "      <td>广深磁悬浮预留线路，首次曝光！</td>\n",
       "      <td>2021-02-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>备受关注的广深磁悬浮建设线路有了最新消息。2月24日，在广州市推进全球重要交通枢纽建设情况发...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2187</th>\n",
       "      <td>遇刺女法官被追授“全国模范”</td>\n",
       "      <td>2021-02-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>南都记者了解到，日前，人力资源社会保障部、最高人民法院联合印发决定，追授湖南省高级人民法院审...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2188</th>\n",
       "      <td>小霸王文化发展公司涉非法吸收公众存款！警方通报</td>\n",
       "      <td>2021-02-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>2月27日，广东中山市公安局发布《关于小霸王文化发展有限公司涉嫌非法吸收公众存款案的公告》。...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2189</th>\n",
       "      <td>小孩网购糖果，顺手“清空购物车”！妈妈吓一跳</td>\n",
       "      <td>2021-02-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>不少家长平时喜欢把手机给家里的小孩玩，日前，深圳市民梁女士就因为家里的孩子拿她的手机一番操作...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2190</th>\n",
       "      <td>被骗，被坑！一定要做这件事</td>\n",
       "      <td>2021-02-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>“3·15”国际消费者权益日将近“守护安全 畅通消费”是中消协2021年消费维权主题过去一年...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                        title create_time  \\\n",
       "2186          广深磁悬浮预留线路，首次曝光！  2021-02-27   \n",
       "2187           遇刺女法官被追授“全国模范”  2021-02-27   \n",
       "2188  小霸王文化发展公司涉非法吸收公众存款！警方通报  2021-02-27   \n",
       "2189   小孩网购糖果，顺手“清空购物车”！妈妈吓一跳  2021-02-27   \n",
       "2190            被骗，被坑！一定要做这件事  2021-02-27   \n",
       "\n",
       "                                                   link  \\\n",
       "2186  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2187  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2188  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2189  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2190  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "\n",
       "                                           content_text  \n",
       "2186  备受关注的广深磁悬浮建设线路有了最新消息。2月24日，在广州市推进全球重要交通枢纽建设情况发...  \n",
       "2187  南都记者了解到，日前，人力资源社会保障部、最高人民法院联合印发决定，追授湖南省高级人民法院审...  \n",
       "2188  2月27日，广东中山市公安局发布《关于小霸王文化发展有限公司涉嫌非法吸收公众存款案的公告》。...  \n",
       "2189  不少家长平时喜欢把手机给家里的小孩玩，日前，深圳市民梁女士就因为家里的孩子拿她的手机一番操作...  \n",
       "2190  “3·15”国际消费者权益日将近“守护安全 畅通消费”是中消协2021年消费维权主题过去一年...  "
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Last five rows of the combined table.\n",
    "df_url_out.iloc[-5:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>content_text</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>value</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>广州最新！7地铁站只出不进！南浦全岛检测！荔湾隔离居民安排来了</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>今早（5月29日），《广州市新型冠状病毒肺炎疫情防控指挥部关于新冠肺炎疫情分级分类防控工作的...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>佛山刚刚通报！1岁女童确诊</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>据广东卫健委，5月28日0-24时，广东全省新增2例本土，广州报告1例，为无症状感染者转确诊...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>中国老太远赴法国，后续令人惊叹……</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>·</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>冯小刚已赔1.68亿</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>本周三（5月26日），因冯小刚对赌失败一事，华谊兄弟再被推上热搜：冯小刚已向华谊支付1.68...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>​冲上热搜！这100份申请书，震撼心灵</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>钱学森、钱三强等人入党申请书长啥样？他们是怎么跟党组织“交心”？为何不少高龄科学家也要入党？...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2185</th>\n",
       "      <td>银保监会回应“中国人寿被举报大量造假”</td>\n",
       "      <td>2021-02-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>近日，中国人寿嫩江支公司女员工在网络实名举报“中国人寿大量造假”，引发热议。昨晚（26日），...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2186</th>\n",
       "      <td>广深磁悬浮预留线路，首次曝光！</td>\n",
       "      <td>2021-02-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>备受关注的广深磁悬浮建设线路有了最新消息。2月24日，在广州市推进全球重要交通枢纽建设情况发...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2187</th>\n",
       "      <td>遇刺女法官被追授“全国模范”</td>\n",
       "      <td>2021-02-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>南都记者了解到，日前，人力资源社会保障部、最高人民法院联合印发决定，追授湖南省高级人民法院审...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2188</th>\n",
       "      <td>小霸王文化发展公司涉非法吸收公众存款！警方通报</td>\n",
       "      <td>2021-02-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>2月27日，广东中山市公安局发布《关于小霸王文化发展有限公司涉嫌非法吸收公众存款案的公告》。...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2189</th>\n",
       "      <td>小孩网购糖果，顺手“清空购物车”！妈妈吓一跳</td>\n",
       "      <td>2021-02-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>不少家长平时喜欢把手机给家里的小孩玩，日前，深圳市民梁女士就因为家里的孩子拿她的手机一番操作...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2017 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 title create_time  \\\n",
       "value                                                \n",
       "0      广州最新！7地铁站只出不进！南浦全岛检测！荔湾隔离居民安排来了  2021-05-29   \n",
       "1                        佛山刚刚通报！1岁女童确诊  2021-05-29   \n",
       "2                    中国老太远赴法国，后续令人惊叹……  2021-05-29   \n",
       "3                           冯小刚已赔1.68亿  2021-05-29   \n",
       "4                  ​冲上热搜！这100份申请书，震撼心灵  2021-05-29   \n",
       "...                                ...         ...   \n",
       "2185               银保监会回应“中国人寿被举报大量造假”  2021-02-27   \n",
       "2186                   广深磁悬浮预留线路，首次曝光！  2021-02-27   \n",
       "2187                    遇刺女法官被追授“全国模范”  2021-02-27   \n",
       "2188           小霸王文化发展公司涉非法吸收公众存款！警方通报  2021-02-27   \n",
       "2189            小孩网购糖果，顺手“清空购物车”！妈妈吓一跳  2021-02-27   \n",
       "\n",
       "                                                    link  \\\n",
       "value                                                      \n",
       "0      http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1      http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2      http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "3      http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "4      http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "...                                                  ...   \n",
       "2185   http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2186   http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2187   http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2188   http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2189   http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "\n",
       "                                            content_text  \n",
       "value                                                     \n",
       "0      今早（5月29日），《广州市新型冠状病毒肺炎疫情防控指挥部关于新冠肺炎疫情分级分类防控工作的...  \n",
       "1      据广东卫健委，5月28日0-24时，广东全省新增2例本土，广州报告1例，为无症状感染者转确诊...  \n",
       "2                                                      ·  \n",
       "3      本周三（5月26日），因冯小刚对赌失败一事，华谊兄弟再被推上热搜：冯小刚已向华谊支付1.68...  \n",
       "4      钱学森、钱三强等人入党申请书长啥样？他们是怎么跟党组织“交心”？为何不少高龄科学家也要入党？...  \n",
       "...                                                  ...  \n",
       "2185   近日，中国人寿嫩江支公司女员工在网络实名举报“中国人寿大量造假”，引发热议。昨晚（26日），...  \n",
       "2186   备受关注的广深磁悬浮建设线路有了最新消息。2月24日，在广州市推进全球重要交通枢纽建设情况发...  \n",
       "2187   南都记者了解到，日前，人力资源社会保障部、最高人民法院联合印发决定，追授湖南省高级人民法院审...  \n",
       "2188   2月27日，广东中山市公安局发布《关于小霸王文化发展有限公司涉嫌非法吸收公众存款案的公告》。...  \n",
       "2189   不少家长平时喜欢把手机给家里的小孩玩，日前，深圳市民梁女士就因为家里的孩子拿她的手机一番操作...  \n",
       "\n",
       "[2017 rows x 4 columns]"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# tagging 标记\n",
    "tagging_list = [\"\",\"马化腾\", \"腾讯\", \"微众银行\",\"腾讯复星\",\"腾讯风控\",\"腾讯支付\",\"WeChat\",\"We Remit\",\\\n",
    "                \"公益日\",\"红包\",\\\n",
    "                \"腾讯财付通\",\"鹅厂\",\"QQ钱包\",\"QQ红包\",\"QQ\",\\\n",
    "                \"只有一\",\"大咖\",\"听说\",\"图片\",\"照片\",\"小编\",\\\n",
    "                \"洗钱\", \"黑产\",\"被骗\",\"腾讯安全课\",\"诈骗\", \"炒股\",\"神秘兼职\",\"神秘组织\",\\\n",
    "                \"财付通\", \"品牌\",\\\n",
    "                \"收款\",\\\n",
    "                \"报告\",\\\n",
    "                \"银行卡\",\"理财\",\"选股\",\"发票\",\"基金\",\\\n",
    "                \"区块链\",\"金融云\",\"O2O\",\"农产品\",\"家乡\",\\\n",
    "                \"数据\", \"数据赋能\", \"智能\", \"数字孪生\", \"智慧大脑\",\\\n",
    "                \"出行\",\"乘车\",\"公交\",\"乘车码\", \"智慧地铁\",\\\n",
    "                \"高峰论坛\", \"智库\",\\\n",
    "                \"央行\",\"新规\", \\\n",
    "                \"微信\", \"微信支付\", \"跨境支付\", \"移动支付\",\"非银行支付\",\"电子支付\",\\\n",
    "                \"互联网金融\", \"金融科技\",\"互联网＋\",\"互联网+金融\",\"普惠金融\",\"虚拟银行\",\\\n",
    "                \"开放\",\"生态\",\"复杂\",\"互联网思维\",\"全球合作伙伴\",\\\n",
    "                \"联合国\", \"城市\", \"粤港澳大湾区\", \"平台\", \"可持续发展\", \"未来\", \"绿色\",\\\n",
    "                \"医护\",\"防护服\",\"小时\",\"武汉\",\"危机\",\"新冠肺炎\", \"疫\", \"疫情\", \"复工\",\"停课\",\"宅经济\",\\\n",
    "                \"基建\",\"新基建\"] #overwritable\n",
    "\n",
    "# For every tag, build a one-column frame (\"variable\" = tag) indexed by the\n",
    "# row labels of the titles that contain it.\n",
    "v_v_list = []\n",
    "\n",
    "for tag in tagging_list:\n",
    "    # regex=False: tags are literal keywords. As a regex, \"互联网+金融\" treats\n",
    "    # \"+\" as a quantifier and never matches the literal text.\n",
    "    # na=False: a missing title counts as \"no match\" instead of breaking the mask.\n",
    "    is_hit = df_url_out.title.str.contains(tag, regex=False, na=False)\n",
    "    index_list = df_url_out[is_hit].index.to_list()\n",
    "    v_v_pairs = pd.DataFrame({tag: index_list}).melt().set_index(\"value\")\n",
    "    v_v_list.append(v_v_pairs)\n",
    "\n",
    "# The first frame is the baseline; each later frame overwrites the label of\n",
    "# the rows it matched, so the last matching tag in tagging_list wins.\n",
    "df_cat = v_v_list[0]\n",
    "for d in v_v_list:\n",
    "    df_cat.update(d)\n",
    "\n",
    "# Content not tagged yet\n",
    "df_url_out.loc[df_cat.query('variable==\"\"').index]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ==&mid=2651044761&idx=7&sn=02ffbf121a31db99e16e96da7da3e7af&chksm=4794b87770e33161a2f645c872255aa735e23e6d3ebb1501ff0d67b382ae4a6a041c8c9e5864#rd'"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Full URL stored in the link column of row 53.\n",
    "df_url_out.loc[53, \"link\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>content_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [title, create_time, link, content_text]\n",
       "Index: []"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows that exactly repeat an earlier row.\n",
    "df_url_out.loc[df_url_out.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>content_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>广州最新！7地铁站只出不进！南浦全岛检测！荔湾隔离居民安排来了</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>今早（5月29日），《广州市新型冠状病毒肺炎疫情防控指挥部关于新冠肺炎疫情分级分类防控工作的...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>佛山刚刚通报！1岁女童确诊</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>据广东卫健委，5月28日0-24时，广东全省新增2例本土，广州报告1例，为无症状感染者转确诊...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>中国老太远赴法国，后续令人惊叹……</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>·</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>冯小刚已赔1.68亿</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>本周三（5月26日），因冯小刚对赌失败一事，华谊兄弟再被推上热搜：冯小刚已向华谊支付1.68...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>​冲上热搜！这100份申请书，震撼心灵</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>钱学森、钱三强等人入党申请书长啥样？他们是怎么跟党组织“交心”？为何不少高龄科学家也要入党？...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2186</th>\n",
       "      <td>广深磁悬浮预留线路，首次曝光！</td>\n",
       "      <td>2021-02-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>备受关注的广深磁悬浮建设线路有了最新消息。2月24日，在广州市推进全球重要交通枢纽建设情况发...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2187</th>\n",
       "      <td>遇刺女法官被追授“全国模范”</td>\n",
       "      <td>2021-02-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>南都记者了解到，日前，人力资源社会保障部、最高人民法院联合印发决定，追授湖南省高级人民法院审...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2188</th>\n",
       "      <td>小霸王文化发展公司涉非法吸收公众存款！警方通报</td>\n",
       "      <td>2021-02-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>2月27日，广东中山市公安局发布《关于小霸王文化发展有限公司涉嫌非法吸收公众存款案的公告》。...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2189</th>\n",
       "      <td>小孩网购糖果，顺手“清空购物车”！妈妈吓一跳</td>\n",
       "      <td>2021-02-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>不少家长平时喜欢把手机给家里的小孩玩，日前，深圳市民梁女士就因为家里的孩子拿她的手机一番操作...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2190</th>\n",
       "      <td>被骗，被坑！一定要做这件事</td>\n",
       "      <td>2021-02-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>“3·15”国际消费者权益日将近“守护安全 畅通消费”是中消协2021年消费维权主题过去一年...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2191 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                title create_time  \\\n",
       "0     广州最新！7地铁站只出不进！南浦全岛检测！荔湾隔离居民安排来了  2021-05-29   \n",
       "1                       佛山刚刚通报！1岁女童确诊  2021-05-29   \n",
       "2                   中国老太远赴法国，后续令人惊叹……  2021-05-29   \n",
       "3                          冯小刚已赔1.68亿  2021-05-29   \n",
       "4                 ​冲上热搜！这100份申请书，震撼心灵  2021-05-29   \n",
       "...                               ...         ...   \n",
       "2186                  广深磁悬浮预留线路，首次曝光！  2021-02-27   \n",
       "2187                   遇刺女法官被追授“全国模范”  2021-02-27   \n",
       "2188          小霸王文化发展公司涉非法吸收公众存款！警方通报  2021-02-27   \n",
       "2189           小孩网购糖果，顺手“清空购物车”！妈妈吓一跳  2021-02-27   \n",
       "2190                    被骗，被坑！一定要做这件事  2021-02-27   \n",
       "\n",
       "                                                   link  \\\n",
       "0     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "3     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "4     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "...                                                 ...   \n",
       "2186  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2187  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2188  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2189  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2190  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "\n",
       "                                           content_text  \n",
       "0     今早（5月29日），《广州市新型冠状病毒肺炎疫情防控指挥部关于新冠肺炎疫情分级分类防控工作的...  \n",
       "1     据广东卫健委，5月28日0-24时，广东全省新增2例本土，广州报告1例，为无症状感染者转确诊...  \n",
       "2                                                     ·  \n",
       "3     本周三（5月26日），因冯小刚对赌失败一事，华谊兄弟再被推上热搜：冯小刚已向华谊支付1.68...  \n",
       "4     钱学森、钱三强等人入党申请书长啥样？他们是怎么跟党组织“交心”？为何不少高龄科学家也要入党？...  \n",
       "...                                                 ...  \n",
       "2186  备受关注的广深磁悬浮建设线路有了最新消息。2月24日，在广州市推进全球重要交通枢纽建设情况发...  \n",
       "2187  南都记者了解到，日前，人力资源社会保障部、最高人民法院联合印发决定，追授湖南省高级人民法院审...  \n",
       "2188  2月27日，广东中山市公安局发布《关于小霸王文化发展有限公司涉嫌非法吸收公众存款案的公告》。...  \n",
       "2189  不少家长平时喜欢把手机给家里的小孩玩，日前，深圳市民梁女士就因为家里的孩子拿她的手机一番操作...  \n",
       "2190  “3·15”国际消费者权益日将近“守护安全 畅通消费”是中消协2021年消费维权主题过去一年...  \n",
       "\n",
       "[2191 rows x 4 columns]"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out[~df_url_out.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>content_text</th>\n",
       "      <th>variable</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>广州最新！7地铁站只出不进！南浦全岛检测！荔湾隔离居民安排来了</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>今早（5月29日），《广州市新型冠状病毒肺炎疫情防控指挥部关于新冠肺炎疫情分级分类防控工作的...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>佛山刚刚通报！1岁女童确诊</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>据广东卫健委，5月28日0-24时，广东全省新增2例本土，广州报告1例，为无症状感染者转确诊...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>中国老太远赴法国，后续令人惊叹……</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>·</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>冯小刚已赔1.68亿</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>本周三（5月26日），因冯小刚对赌失败一事，华谊兄弟再被推上热搜：冯小刚已向华谊支付1.68...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>​冲上热搜！这100份申请书，震撼心灵</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>钱学森、钱三强等人入党申请书长啥样？他们是怎么跟党组织“交心”？为何不少高龄科学家也要入党？...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2186</th>\n",
       "      <td>广深磁悬浮预留线路，首次曝光！</td>\n",
       "      <td>2021-02-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>备受关注的广深磁悬浮建设线路有了最新消息。2月24日，在广州市推进全球重要交通枢纽建设情况发...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2187</th>\n",
       "      <td>遇刺女法官被追授“全国模范”</td>\n",
       "      <td>2021-02-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>南都记者了解到，日前，人力资源社会保障部、最高人民法院联合印发决定，追授湖南省高级人民法院审...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2188</th>\n",
       "      <td>小霸王文化发展公司涉非法吸收公众存款！警方通报</td>\n",
       "      <td>2021-02-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>2月27日，广东中山市公安局发布《关于小霸王文化发展有限公司涉嫌非法吸收公众存款案的公告》。...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2189</th>\n",
       "      <td>小孩网购糖果，顺手“清空购物车”！妈妈吓一跳</td>\n",
       "      <td>2021-02-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>不少家长平时喜欢把手机给家里的小孩玩，日前，深圳市民梁女士就因为家里的孩子拿她的手机一番操作...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2190</th>\n",
       "      <td>被骗，被坑！一定要做这件事</td>\n",
       "      <td>2021-02-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>“3·15”国际消费者权益日将近“守护安全 畅通消费”是中消协2021年消费维权主题过去一年...</td>\n",
       "      <td>被骗</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2191 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                title create_time  \\\n",
       "0     广州最新！7地铁站只出不进！南浦全岛检测！荔湾隔离居民安排来了  2021-05-29   \n",
       "1                       佛山刚刚通报！1岁女童确诊  2021-05-29   \n",
       "2                   中国老太远赴法国，后续令人惊叹……  2021-05-29   \n",
       "3                          冯小刚已赔1.68亿  2021-05-29   \n",
       "4                 ​冲上热搜！这100份申请书，震撼心灵  2021-05-29   \n",
       "...                               ...         ...   \n",
       "2186                  广深磁悬浮预留线路，首次曝光！  2021-02-27   \n",
       "2187                   遇刺女法官被追授“全国模范”  2021-02-27   \n",
       "2188          小霸王文化发展公司涉非法吸收公众存款！警方通报  2021-02-27   \n",
       "2189           小孩网购糖果，顺手“清空购物车”！妈妈吓一跳  2021-02-27   \n",
       "2190                    被骗，被坑！一定要做这件事  2021-02-27   \n",
       "\n",
       "                                                   link  \\\n",
       "0     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "3     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "4     http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "...                                                 ...   \n",
       "2186  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2187  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2188  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2189  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "2190  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "\n",
       "                                           content_text variable  \n",
       "0     今早（5月29日），《广州市新型冠状病毒肺炎疫情防控指挥部关于新冠肺炎疫情分级分类防控工作的...     无法分类  \n",
       "1     据广东卫健委，5月28日0-24时，广东全省新增2例本土，广州报告1例，为无症状感染者转确诊...     无法分类  \n",
       "2                                                     ·     无法分类  \n",
       "3     本周三（5月26日），因冯小刚对赌失败一事，华谊兄弟再被推上热搜：冯小刚已向华谊支付1.68...     无法分类  \n",
       "4     钱学森、钱三强等人入党申请书长啥样？他们是怎么跟党组织“交心”？为何不少高龄科学家也要入党？...     无法分类  \n",
       "...                                                 ...      ...  \n",
       "2186  备受关注的广深磁悬浮建设线路有了最新消息。2月24日，在广州市推进全球重要交通枢纽建设情况发...     无法分类  \n",
       "2187  南都记者了解到，日前，人力资源社会保障部、最高人民法院联合印发决定，追授湖南省高级人民法院审...     无法分类  \n",
       "2188  2月27日，广东中山市公安局发布《关于小霸王文化发展有限公司涉嫌非法吸收公众存款案的公告》。...     无法分类  \n",
       "2189  不少家长平时喜欢把手机给家里的小孩玩，日前，深圳市民梁女士就因为家里的孩子拿她的手机一番操作...     无法分类  \n",
       "2190  “3·15”国际消费者权益日将近“守护安全 畅通消费”是中消协2021年消费维权主题过去一年...       被骗  \n",
       "\n",
       "[2191 rows x 5 columns]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_o = df_url_out.join(df_cat).replace(\"\", np.nan).fillna(\"无法分类\")\n",
    "df_o"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>content_text</th>\n",
       "      <th>variable</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>腾讯爱奇艺优酷同时发声：严厉谴责B站！</td>\n",
       "      <td>2021-05-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>长视频行业再次联合就版权问题发声。昨天（5月28日）下午3点50分，三大长视频平台腾讯视频、...</td>\n",
       "      <td>腾讯</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>权威认证背后：一群腾讯“安全侠”的“信任”江湖</td>\n",
       "      <td>2021-05-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>‍腾讯安全的六位零信任“安全侠”‍腾讯零信任安全管理系统 iOAForrester为腾讯零信...</td>\n",
       "      <td>腾讯</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>515</th>\n",
       "      <td>抖音火山版被判赔腾讯800万</td>\n",
       "      <td>2021-05-08</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>因火山小视频上的游戏直播涉嫌侵害腾讯著作权，腾讯将火山小视频（现更名为“抖音火山版”）诉至法...</td>\n",
       "      <td>腾讯</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>659</th>\n",
       "      <td>腾讯/万达/滴滴等合共领罚650万</td>\n",
       "      <td>2021-05-02</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>4月30日，市场监管总局发布公告称，根据《中华人民共和国反垄断法》规定，对九起违法实施经营者...</td>\n",
       "      <td>腾讯</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>779</th>\n",
       "      <td>卖破解版Switch，被腾讯索赔100万，判了</td>\n",
       "      <td>2021-04-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>上周四（4月22日），广州市越秀区人民法院对一批知识产权案件集中进行一审公开宣判，其中包括腾...</td>\n",
       "      <td>腾讯</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1104</th>\n",
       "      <td>今天，腾讯/京东/拼多多/百度等巨头被喊去开会！</td>\n",
       "      <td>2021-04-13</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>今天（4月13日），市场监管总局会同中央网信办、税务总局召开互联网平台企业行政指导会。会议贯...</td>\n",
       "      <td>腾讯</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1194</th>\n",
       "      <td>腾讯优酷爱奇艺联合声明：将维权！</td>\n",
       "      <td>2021-04-10</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>昨天（4月9日）下午，包括行业协会、影视公司及视频平台在内的超过70家机构发布联合声明，表示...</td>\n",
       "      <td>腾讯</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1218</th>\n",
       "      <td>腾讯第一大股东减持！套现千亿</td>\n",
       "      <td>2021-04-09</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>前天（4月7日），（南非报业Naspers子公司）宣布将通过附属公司MIH TC Holdi...</td>\n",
       "      <td>腾讯</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1314</th>\n",
       "      <td>继爱奇艺后，腾讯视频也涨价了</td>\n",
       "      <td>2021-04-05</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>又一家视频平台会员涨价。近日，腾讯视频VIP的官方微博账号发布通知称，将于2021年4月10...</td>\n",
       "      <td>腾讯</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1338</th>\n",
       "      <td>侵入并控制腾讯系统，这伙人获刑！</td>\n",
       "      <td>2021-04-04</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>3月31日，广州市南沙区人民法院公开宣判被告人陈某展等38人提供侵入、非法控制计算机信息系统...</td>\n",
       "      <td>腾讯</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1396</th>\n",
       "      <td>腾讯一员工自杀被救，遗书引热议</td>\n",
       "      <td>2021-04-01</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>3月30日，有媒体报道腾讯一员工参与网络跨境赌博输近500万，在腾讯内部论坛留了一篇遗书后自...</td>\n",
       "      <td>腾讯</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1550</th>\n",
       "      <td>“腾讯员工人均年薪81万”上热搜！网友评论太扎心</td>\n",
       "      <td>2021-03-26</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>前日（3月24日）#腾讯员工2020年人均年薪81万#登上微博热搜▼平均年薪81万这一数据从...</td>\n",
       "      <td>腾讯</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1728</th>\n",
       "      <td>腾讯阿里小米等11家企业被约谈</td>\n",
       "      <td>2021-03-18</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>据微信公众号“网信中国”今天（18日）上午消息：针对近期未履行安全评估程序的语音社交软件和涉...</td>\n",
       "      <td>腾讯</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1857</th>\n",
       "      <td>抖音诉腾讯垄断案有新进展</td>\n",
       "      <td>2021-03-13</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>抖音诉腾讯垄断一案有了新进展。南都记者了解到，3月5日，腾讯向北京知识产权法院提出管辖权异议...</td>\n",
       "      <td>腾讯</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1872</th>\n",
       "      <td>腾讯/百度/好未来等12家企业被罚50万</td>\n",
       "      <td>2021-03-12</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>今日（12日），国家市场监督管理总局在其官网通报，对互联网领域十起违法实施经营者集中案作出行...</td>\n",
       "      <td>未来</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1945</th>\n",
       "      <td>腾讯致歉：下架！退钱！</td>\n",
       "      <td>2021-03-09</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...</td>\n",
       "      <td>前天（3月7日），腾讯网游加速器官方微博发布了《关于腾讯网游加速器下架单机游戏离线版账号的声...</td>\n",
       "      <td>腾讯</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         title create_time  \\\n",
       "15         腾讯爱奇艺优酷同时发声：严厉谴责B站！  2021-05-29   \n",
       "40     权威认证背后：一群腾讯“安全侠”的“信任”江湖  2021-05-28   \n",
       "515             抖音火山版被判赔腾讯800万  2021-05-08   \n",
       "659          腾讯/万达/滴滴等合共领罚650万  2021-05-02   \n",
       "779    卖破解版Switch，被腾讯索赔100万，判了  2021-04-27   \n",
       "1104  今天，腾讯/京东/拼多多/百度等巨头被喊去开会！  2021-04-13   \n",
       "1194          腾讯优酷爱奇艺联合声明：将维权！  2021-04-10   \n",
       "1218            腾讯第一大股东减持！套现千亿  2021-04-09   \n",
       "1314            继爱奇艺后，腾讯视频也涨价了  2021-04-05   \n",
       "1338          侵入并控制腾讯系统，这伙人获刑！  2021-04-04   \n",
       "1396           腾讯一员工自杀被救，遗书引热议  2021-04-01   \n",
       "1550  “腾讯员工人均年薪81万”上热搜！网友评论太扎心  2021-03-26   \n",
       "1728           腾讯阿里小米等11家企业被约谈  2021-03-18   \n",
       "1857              抖音诉腾讯垄断案有新进展  2021-03-13   \n",
       "1872      腾讯/百度/好未来等12家企业被罚50万  2021-03-12   \n",
       "1945               腾讯致歉：下架！退钱！  2021-03-09   \n",
       "\n",
       "                                                   link  \\\n",
       "15    http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "40    http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "515   http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "659   http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "779   http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1104  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1194  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1218  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1314  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1338  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1396  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1550  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1728  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1857  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1872  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "1945  http://mp.weixin.qq.com/s?__biz=MTk1MjIwODAwMQ...   \n",
       "\n",
       "                                           content_text variable  \n",
       "15    长视频行业再次联合就版权问题发声。昨天（5月28日）下午3点50分，三大长视频平台腾讯视频、...       腾讯  \n",
       "40    ‍腾讯安全的六位零信任“安全侠”‍腾讯零信任安全管理系统 iOAForrester为腾讯零信...       腾讯  \n",
       "515   因火山小视频上的游戏直播涉嫌侵害腾讯著作权，腾讯将火山小视频（现更名为“抖音火山版”）诉至法...       腾讯  \n",
       "659   4月30日，市场监管总局发布公告称，根据《中华人民共和国反垄断法》规定，对九起违法实施经营者...       腾讯  \n",
       "779   上周四（4月22日），广州市越秀区人民法院对一批知识产权案件集中进行一审公开宣判，其中包括腾...       腾讯  \n",
       "1104  今天（4月13日），市场监管总局会同中央网信办、税务总局召开互联网平台企业行政指导会。会议贯...       腾讯  \n",
       "1194  昨天（4月9日）下午，包括行业协会、影视公司及视频平台在内的超过70家机构发布联合声明，表示...       腾讯  \n",
       "1218  前天（4月7日），（南非报业Naspers子公司）宣布将通过附属公司MIH TC Holdi...       腾讯  \n",
       "1314  又一家视频平台会员涨价。近日，腾讯视频VIP的官方微博账号发布通知称，将于2021年4月10...       腾讯  \n",
       "1338  3月31日，广州市南沙区人民法院公开宣判被告人陈某展等38人提供侵入、非法控制计算机信息系统...       腾讯  \n",
       "1396  3月30日，有媒体报道腾讯一员工参与网络跨境赌博输近500万，在腾讯内部论坛留了一篇遗书后自...       腾讯  \n",
       "1550  前日（3月24日）#腾讯员工2020年人均年薪81万#登上微博热搜▼平均年薪81万这一数据从...       腾讯  \n",
       "1728  据微信公众号“网信中国”今天（18日）上午消息：针对近期未履行安全评估程序的语音社交软件和涉...       腾讯  \n",
       "1857  抖音诉腾讯垄断一案有了新进展。南都记者了解到，3月5日，腾讯向北京知识产权法院提出管辖权异议...       腾讯  \n",
       "1872  今日（12日），国家市场监督管理总局在其官网通报，对互联网领域十起违法实施经营者集中案作出行...       未来  \n",
       "1945  前天（3月7日），腾讯网游加速器官方微博发布了《关于腾讯网游加速器下架单机游戏离线版账号的声...       腾讯  "
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_o[df_o.title.str.contains(\"腾讯\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>variable</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>无法分类</th>\n",
       "      <td>2017</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>疫</th>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>微信</th>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>腾讯</th>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>品牌</th>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>小时</th>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>平台</th>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>疫情</th>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>报告</th>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>照片</th>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>武汉</th>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>城市</th>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>公交</th>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>被骗</th>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>停课</th>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>出行</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>未来</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>危机</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>数据</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>智能</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>诈骗</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>银行卡</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>绿色</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>新规</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>马化腾</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>理财</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>生态</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>乘车</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>新冠肺炎</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>红包</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>微信支付</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>联合国</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>开放</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>家乡</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>央行</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>听说</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>防护服</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>QQ</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          title\n",
       "variable       \n",
       "无法分类       2017\n",
       "疫            24\n",
       "微信           17\n",
       "腾讯           15\n",
       "品牌           11\n",
       "小时           10\n",
       "平台            9\n",
       "疫情            7\n",
       "报告            7\n",
       "照片            6\n",
       "武汉            6\n",
       "城市            6\n",
       "公交            6\n",
       "被骗            5\n",
       "停课            5\n",
       "出行            4\n",
       "未来            4\n",
       "危机            3\n",
       "数据            3\n",
       "智能            3\n",
       "诈骗            2\n",
       "银行卡           2\n",
       "绿色            2\n",
       "新规            2\n",
       "马化腾           2\n",
       "理财            1\n",
       "生态            1\n",
       "乘车            1\n",
       "新冠肺炎          1\n",
       "红包            1\n",
       "微信支付          1\n",
       "联合国           1\n",
       "开放            1\n",
       "家乡            1\n",
       "央行            1\n",
       "听说            1\n",
       "防护服           1\n",
       "QQ            1"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_stats = df_o.groupby(by=\"variable\").agg({\"title\":\"count\"}).sort_values(by=\"title\", ascending=False)\n",
    "df_stats"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 输出"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_account.columns.name = \"rel_accounts\"\n",
    "df_o.columns.name = \"url_cat\"\n",
    "df_stats.columns.name = \"stats\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "_df_.columns.name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the xlsxwriter workbook and worksheet objects.  \n",
    "with pd.ExcelWriter(fn[\"output\"][\"公众号_xlsx\"].format(公众号=公众号)) as writer:\n",
    "    workbook  = writer.book\n",
    "\n",
    "    for _df_ in [df_account, df_o, df_stats]:\n",
    "        _df_.to_excel(writer, sheet_name = _df_.columns.name)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "206.181px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
